/* Copyright (c) 2003 The Nutch Organization.  All rights reserved.   */
/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */

package net.nutch.db;

import java.io.*;
import java.util.*;
import java.util.logging.*;
import java.nio.channels.*;

import net.nutch.io.*;
import net.nutch.util.*;
import net.nutch.pagedb.*;
import net.nutch.linkdb.*;

/***************************************************
 * This is a wrapper class that allows us to reorder
 * write operations to the linkdb and pagedb.  It is
 * useful only for objects like UpdateDatabaseTool,
 * which just do writes.
 *
 * The DistributedWebDBWriter is the distributed implementor
 * of IWebDBWriter.  Instructions are buffered to per-machine
 * "edit section" files on the NutchFileSystem; at close(),
 * each participating machine sorts the edits for its own key
 * range and merges them into a fresh copy of its db section.
 * (WebDBWriter is the traditional single-process equivalent.)
 *
 * @author Mike Cafarella
 *************************************************/
public class DistributedWebDBWriter implements IWebDBWriter {
    static final Logger LOG = LogFormatter.getLogger("net.nutch.db.DistributedWebDBWriter");
    static final byte CUR_VERSION = 0;
    static final byte OPEN_COUNTER_VERSION = 0;
    static final byte CLOSE_COUNTER_VERSION = 0;
    static final byte MACHINE_INFO_VERSION = 0;

    // magic numbers
    static int READY_TO_USE = 0xbabecafe;
    static int IS_COMPLETE = 0xbabe0000;
    static int WRITE_LOCK_INFO = 0xcafe0000;
    static long LONG_TIMEOUT = 10 * 1000;

    // db opcodes
    static final byte ADD_PAGE = 0;
    static final byte ADD_PAGE_WITH_SCORE = 1;
    static final byte ADD_PAGE_IFN_PRESENT = 2;
    static final byte DEL_PAGE = 3;
    static final int ADD_LINK = 0;
    static final int DEL_LINK = 1;
    static final int DEL_SINGLE_LINK = 2;

    // filenames
    static final String PAGES_BY_URL = "pagesByURL";
    static final String PAGES_BY_MD5 = "pagesByMD5";
    static final String LINKS_BY_URL = "linksByURL";
    static final String LINKS_BY_MD5 = "linksByMD5";
    static final String STATS_FILE = "stats";
    static final String META_SHAREGROUP = "metashare";
    static final String METAINFO = "metainfo";

    // Result codes for page-url comparisons
    static final int NO_OUTLINKS = 0;
    static final int HAS_OUTLINKS = 1;
    static final int LINK_INVALID = 2;

    /********************************************
     * PageInstruction holds an operation over a Page.
     *********************************************/
    public static class PageInstruction implements WritableComparable {
        byte opcode;
        boolean hasLink;
        Page page;
        Link link;

        /** */
        public PageInstruction() {}

        /** */
        public PageInstruction(Page page, int opcode) {
            set(page, opcode);
        }

        /** */
        public PageInstruction(Page page, Link link, int opcode) {
            set(page, link, opcode);
        }

        /**
         * Init from another PageInstruction object.
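         * Performs a deep copy, so the source instruction may be reused
         * by the caller.  A reuse sketch (variable names are illustrative):
         * <pre>
         *   PageInstruction scratch = new PageInstruction();
         *   scratch.set(incoming);   // 'incoming' can now be overwritten safely
         * </pre>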
*/ public void set(PageInstruction that) { this.opcode = that.opcode; if (this.page == null) { this.page = new Page(); } this.page.set(that.page); if (this.link == null) { this.link = new Link(); } this.hasLink = that.hasLink; if (this.hasLink) { this.link.set(that.link); } } /** * Init PageInstruction with no Link */ public void set(Page page, int opcode) { this.opcode = (byte) opcode; this.page = page; this.hasLink = false; this.link = null; } /** * Init PageInstruction with a Link */ public void set(Page page, Link link, int opcode) { this.opcode = (byte) opcode; this.page = page; this.hasLink = true; this.link = link; } // // WritableComparable // public int compareTo(Object o) { int pageResult = this.page.compareTo(((PageInstruction) o).page); if (pageResult != 0) { return pageResult; } else { return this.opcode - (((PageInstruction) o).opcode); } } public void write(DataOutput out) throws IOException { out.writeByte(opcode); page.write(out); out.writeByte(hasLink ? 1 : 0); if (hasLink) { link.write(out); } } public void readFields(DataInput in) throws IOException { opcode = in.readByte(); if (page == null) { page = new Page(); } page.readFields(in); if (link == null) { link = new Link(); } hasLink = (1 == in.readByte()); if (hasLink) { link.readFields(in); } } public Page getPage() { return page; } public Link getLink() { if (hasLink) { return link; } else { return null; } } public int getInstruction() { return opcode; } /** * Sorts the instruction first by Page, then by opcode. */ public static class PageComparator extends WritableComparator { private static final Page.Comparator PAGE_COMPARATOR = new Page.Comparator(); public PageComparator() { super(PageInstruction.class); } /** Optimized comparator. */ public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) { int opcode1 = b1[s1]; int opcode2 = b2[s2]; int c = PAGE_COMPARATOR.compare(b1, s1+1, l1-1, b2, s2+1, l2-1); if (c != 0) return c; return opcode1 - opcode2; } } /***************************************************** * Sorts the instruction first by url, then by opcode. *****************************************************/ public static class UrlComparator extends WritableComparator { private static final Page.UrlComparator PAGE_COMPARATOR = new Page.UrlComparator(); public UrlComparator() { super(PageInstruction.class); } /** * We need to sort by ordered URLs. First, we sort by * URL, then by opcode. */ public int compare(WritableComparable a, WritableComparable b) { PageInstruction instructionA = (PageInstruction)a; PageInstruction instructionB = (PageInstruction)b; Page pageA = instructionA.getPage(); Page pageB = instructionB.getPage(); int result = pageA.getURL().compareTo(pageB.getURL()); if (result != 0) { return result; } else { return instructionA.opcode - instructionB.opcode; } } /** * Optimized comparator. */ public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) { int opcode1 = b1[s1]; int opcode2 = b2[s2]; int c = PAGE_COMPARATOR.compare(b1, s1+1, l1-1, b2, s2+1, l2-1); if (c != 0) return c; return opcode1 - opcode2; } } } /******************************************************** * PageInstructionWriter very efficiently writes a * PageInstruction to an EditSectionGroupWriter. 
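 * A single PageInstruction instance is reused across calls; addPage()
 * below, for example, does:
 * <pre>
 *   piwriter.appendInstructionInfo(pagesByURLWriter, page,
 *                                  ADD_PAGE, NullWritable.get());
 * </pre>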
 * Much better than calling "writer.append(new PageInstruction())".
 ********************************************************/
public static class PageInstructionWriter {
    PageInstruction pi = new PageInstruction();

    /** */
    public PageInstructionWriter() {
    }

    /**
     * Append the PageInstruction info to the indicated SequenceFile,
     * and keep the PI for later reuse.
     */
    public synchronized void appendInstructionInfo(EditSectionGroupWriter writer, Page page, int opcode, Writable val) throws IOException {
        pi.set(page, opcode);
        writer.append(pi, val);
    }

    /**
     * Append the PageInstruction info to the indicated SequenceFile,
     * and keep the PI for later reuse.
     */
    public synchronized void appendInstructionInfo(EditSectionGroupWriter writer, Page page, Link link, int opcode, Writable val) throws IOException {
        pi.set(page, link, opcode);
        writer.append(pi, val);
    }
}

/*************************************************************
 * Reduce multiple instructions for a given url to the single effective
 * instruction.  ADD is prioritized highest, then ADD_IFN_PRESENT, and
 * then DEL.  Not coincidentally, this is exactly the order they're
 * sorted in (ascending opcode), so keeping only the first instruction
 * for each URL keeps the highest-priority one.
 **************************************************************/
private static class DeduplicatingPageSequenceReader {
    SequenceFile.Reader edits;
    PageInstruction current = new PageInstruction();
    UTF8 currentUrl = new UTF8();
    boolean haveCurrent;

    /** */
    public DeduplicatingPageSequenceReader(SequenceFile.Reader edits) throws IOException {
        this.edits = edits;
        this.haveCurrent = edits.next(current, NullWritable.get());
    }

    /** */
    public boolean next(PageInstruction result) throws IOException {
        if (!haveCurrent) {
            return false;
        }

        currentUrl.set(current.getPage().getURL());
        result.set(current);                 // take the first instruction

        do {                                 // skip the rest
        } while ((haveCurrent = edits.next(current, NullWritable.get())) &&
                 currentUrl.compareTo(current.getPage().getURL()) == 0);
        return true;
    }
}

/*************************************************
 * Holds an instruction over a Link.
 *************************************************/
public static class LinkInstruction implements WritableComparable {
    Link link;
    int instruction;

    /** */
    public LinkInstruction() {
    }

    /** */
    public LinkInstruction(Link link, int instruction) {
        set(link, instruction);
    }

    /**
     * Re-init from another LinkInstruction's info.
     */
    public void set(LinkInstruction that) {
        this.instruction = that.instruction;
        if (this.link == null)
            this.link = new Link();
        this.link.set(that.link);
    }

    /**
     * Re-init with a Link and an instruction
     */
    public void set(Link link, int instruction) {
        this.link = link;
        this.instruction = instruction;
    }

    //
    // WritableComparable
    //
    public int compareTo(Object o) {
        return this.link.compareTo(((LinkInstruction) o).link);
    }

    public void write(DataOutput out) throws IOException {
        out.writeByte(instruction);
        link.write(out);
    }

    public void readFields(DataInput in) throws IOException {
        this.instruction = in.readByte();
        if (link == null)
            link = new Link();
        link.readFields(in);
    }

    public Link getLink() {
        return link;
    }

    public int getInstruction() {
        return instruction;
    }

    /*******************************************************
     * Sorts instructions by their Link's MD5 ordering
     * (md5Compare); the opcode takes no part in the comparison.
*******************************************************/ public static class MD5Comparator extends WritableComparator { private static final Link.MD5Comparator MD5_COMPARATOR = new Link.MD5Comparator(); public MD5Comparator() { super(LinkInstruction.class); } public int compare(WritableComparable a, WritableComparable b) { LinkInstruction instructionA = (LinkInstruction)a; LinkInstruction instructionB = (LinkInstruction)b; return instructionA.link.md5Compare(instructionB.link); } /** Optimized comparator. */ public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) { return MD5_COMPARATOR.compare(b1, s1+1, l1-1, b2, s2+1, l2-1); } } /********************************************************* * Sorts the instruction first by url, then by opcode. *********************************************************/ public static class UrlComparator extends WritableComparator { private static final Link.UrlComparator URL_COMPARATOR = new Link.UrlComparator(); public UrlComparator() { super(LinkInstruction.class); } public int compare(WritableComparable a, WritableComparable b) { LinkInstruction instructionA = (LinkInstruction)a; LinkInstruction instructionB = (LinkInstruction)b; return instructionA.link.urlCompare(instructionB.link); } /** * Optimized comparator. */ public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) { return URL_COMPARATOR.compare(b1, s1+1, l1-1, b2, s2+1, l2-1); } } } /******************************************************* * LinkInstructionWriter very efficiently writes a * LinkInstruction to an EditSectionGroupWriter. Much better * than calling "writer.append(new LinkInstruction())" ********************************************************/ public static class LinkInstructionWriter { LinkInstruction li = new LinkInstruction(); /** */ public LinkInstructionWriter() { } /** * Append the LinkInstruction info to the indicated SequenceFile * and keep the LI for later reuse. */ public synchronized void appendInstructionInfo(EditSectionGroupWriter writer, Link link, int opcode, Writable val) throws IOException { li.set(link, opcode); writer.append(li, val); } } /******************************************************** * This class deduplicates link operations. We want to * sort by MD5, then by URL. But all operations * should be unique. *********************************************************/ class DeduplicatingLinkSequenceReader { Link currentKey = new Link(); LinkInstruction current = new LinkInstruction(); SequenceFile.Reader edits; boolean haveCurrent; /** */ public DeduplicatingLinkSequenceReader(SequenceFile.Reader edits) throws IOException { this.edits = edits; this.haveCurrent = edits.next(current, NullWritable.get()); } /** * The incoming stream of edits is sorted first by MD5, then by URL. * MD5-only values always come before MD5+URL. */ public boolean next(LinkInstruction key) throws IOException { if (! haveCurrent) { return false; } currentKey.set(current.getLink()); do { key.set(current); } while ((haveCurrent = edits.next(current, NullWritable.get())) && currentKey.compareTo(current.getLink()) == 0); return true; } } /************************************************** * The CloseProcessor class is used when we close down * the webdb. We give it the path, members, and class values * needed to apply changes to any of our 4 data tables. * * This is an abstract class. Each subclass must define * the exact merge procedure. However, file-handling * and edit-processing is standardized as much as possible. 
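 *
 * At close() the processors run in sequence: pagesByURL, pagesByMD5,
 * linksByMD5, then linksByURL (plus, when needed, a second linksByMD5
 * pass); earlier stages emit the edit streams that later stages consume.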
* **************************************************/ private abstract class CloseProcessor { String basename; String curDBPart; MapFile.Reader oldDb; EditSectionGroupWriter editWriter; SequenceFile.Sorter sorter; WritableComparator comparator; Class keyClass, valueClass; long itemsWritten = 0; /** * Store away these members for later use. */ CloseProcessor(String basename, MapFile.Reader oldDb, EditSectionGroupWriter editWriter, SequenceFile.Sorter sorter, WritableComparator comparator, Class keyClass, Class valueClass, String curDBPart) { this.basename = basename; this.oldDb = oldDb; this.editWriter = editWriter; this.sorter = sorter; this.comparator = comparator; this.keyClass = keyClass; this.valueClass = valueClass; this.curDBPart = curDBPart; } /** * Perform the shutdown sequence for this Processor. * There is a lot of file-moving and edit-sorting that * is common across all the 4 tables. * * Returns how many items were written out by this close(). */ long closeDown(NutchFile workingDir, NutchFile outputDir) throws IOException { // // Done adding edits, so close edit-writer. // editWriter.close(); // // Where the output is going // NutchFile sectionDir = new NutchFile(outputDir, "dbsection." + machineNum); NutchFile newDbNF = new NutchFile(sectionDir, basename); // // Grab all the edits that we need to process. We build an EditSectionGroupReader // and aim it at the right location. The ESR will wait until all its // component Sections are written and completed before returning from // any method (other than the constructor). So we expect to possibly wait // inside the call to numEdits(). // EditSectionGroupReader edits = new EditSectionGroupReader(nutchfs, dbName, basename, machineNum, totalMachines); int numEdits = edits.numEdits(); // If there are edits, then process them. if (numEdits != 0) { File mergedEditsFile = edits.mergeSectionComponents(); File sortedEditsFile = new File(mergedEditsFile.getPath() + ".sorted"); // Sort the edits long startSort = System.currentTimeMillis(); sorter.sort(mergedEditsFile.getPath(), sortedEditsFile.getPath()); long endSort = System.currentTimeMillis(); LOG.info("Processing " + basename + ": Sorted " + numEdits + " instructions in " + ((endSort - startSort) / 1000.0) + " seconds."); LOG.info("Processing " + basename + ": Sorted " + (numEdits / ((endSort - startSort) / 1000.0)) + " instructions/second"); // Delete old file mergedEditsFile.delete(); // Read the sorted edits. That means read all // the edits from the local subsection of the // database. We must merge every machine's // contribution to the edit-list first (which // also means waiting until each machine has // completed that step). // Read the sorted edits SequenceFile.Reader sortedEdits = new SequenceFile.Reader(sortedEditsFile.getPath()); // Create a brand-new output db for the integrated data File newDbFile = nutchfs.getWorkingFile(); MapFile.Writer newDb = (comparator == null) ? new MapFile.Writer(newDbFile.getPath(), keyClass, valueClass) : new MapFile.Writer(newDbFile.getPath(), comparator, valueClass); // Iterate through the edits, and merge changes with existing // db into the brand-new file oldDb.reset(); // Merge the edits. We did it! 
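                // Both inputs are sorted on the same key, so mergeEdits()
                // can interleave them in a single pass.  In outline
                // (pseudocode only; each subclass supplies the real loop):
                //
                //   while (hasEntries && hasEdits) { compare keys; copy, replace, or skip }
                //   drain whichever input remains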
long startMerge = System.currentTimeMillis(); mergeEdits(oldDb, sortedEdits, newDb); long endMerge = System.currentTimeMillis(); LOG.info("Processing " + basename + ": Merged to new DB containing " + itemsWritten + " records in " + ((endMerge - startMerge) / 1000.0) + " seconds"); LOG.info("Processing " + basename + ": Merged " + (itemsWritten / ((endMerge - startMerge) / 1000.0)) + " records/second"); // Close down readers, writers sortedEdits.close(); newDb.close(); // Delete the (sorted) merged-edits sortedEditsFile.delete(); // Store the newly-written db file nutchfs.put(newDbNF, newDbFile, true); } else { // Otherwise, simply copy the original file into place, // without all the processing overhead. long startCopy = System.currentTimeMillis(); NutchFile srcSectionDir = new NutchFile(dbDir, "dbsection." + machineNum); NutchFile srcDbNF = new NutchFile(srcSectionDir, basename); File srcDbFile = nutchfs.get(srcDbNF); nutchfs.put(newDbNF, srcDbFile, true); long endCopy = System.currentTimeMillis(); LOG.info("Processing " + basename + ": Copied file (" + srcDbFile.length()+ " bytes) in " + ((endCopy - startCopy) / 1000.0) + " secs."); } // Delete the now-consumed edits file to save space edits.delete(); return itemsWritten; } /** * The loop that actually applies the changes and writes to * a new db. This is different for every subclass! */ abstract void mergeEdits(MapFile.Reader db, SequenceFile.Reader edits, MapFile.Writer newDb) throws IOException; } /*** * The PagesByURLProcessor is used during close() time for * the pagesByURL table. We instantiate one of these, and it * takes care of the entire shutdown process. */ private class PagesByURLProcessor extends CloseProcessor { EditSectionGroupWriter futureEdits; /** * We store "futureEdits" so we can write edits for the * next table-db step */ PagesByURLProcessor(MapFile.Reader db, EditSectionGroupWriter editWriter, EditSectionGroupWriter futureEdits) { super(PAGES_BY_URL, db, editWriter, new SequenceFile.Sorter(new PageInstruction.UrlComparator(), NullWritable.class), new UTF8.Comparator(), null, Page.class, "PagesByURLPart"); this.futureEdits = futureEdits; } /** * Merge the existing db with the edit-stream into a brand-new file. */ void mergeEdits(MapFile.Reader db, SequenceFile.Reader sortedEdits, MapFile.Writer newDb) throws IOException { // Create the keys and vals we'll be using DeduplicatingPageSequenceReader edits = new DeduplicatingPageSequenceReader(sortedEdits); WritableComparable readerKey = new UTF8(); Page readerVal = new Page(); PageInstruction editItem = new PageInstruction(); int futureOrdering = 0; // Read the first items from both streams boolean hasEntries = db.next(readerKey, readerVal); boolean hasEdits = edits.next(editItem); // As long as we have both edits and entries, we need to // interleave them.... while (hasEntries && hasEdits) { int comparison = readerKey.compareTo(editItem.getPage().getURL()); int curInstruction = editItem.getInstruction(); // Perform operations if ((curInstruction == ADD_PAGE) || (curInstruction == ADD_PAGE_WITH_SCORE) || (curInstruction == ADD_PAGE_IFN_PRESENT)) { if (comparison < 0) { // Write readerKey, just passing it along. // Don't process the edit yet. newDb.append(readerKey, readerVal); itemsWritten++; hasEntries = db.next(readerKey, readerVal); } else if (comparison == 0) { // The keys are equal. If the instruction // is ADD_PAGE, we write the edit's key and // replace the old one. // // Otherwise, if it's ADD_IFN_PRESENT, // keep the reader's item intact. 
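                        // Example: with an existing entry ("http://a/", md5=X)
                        // and an edit ADD_PAGE ("http://a/", md5=Y), the edit
                        // wins here, and a DEL_PAGE for the stale md5=X entry
                        // is forwarded to the MD5-keyed table below.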
// if ((curInstruction == ADD_PAGE) || (curInstruction == ADD_PAGE_WITH_SCORE)) { // An ADD_PAGE with an identical pair // of pages replaces the existing one. // We may need to note the fact for // Garbage Collection. // // This happens in three stages. // 1. We write necessary items to the future // edits-list. // pagesByMD5Edits++; // If this is a replacing add, we don't want // to disturb the score from the old Page! This, // way, we can run some link analysis scoring // while the new Pages are being fetched and // not lose the info when a Page is replaced. // // If it is an ADD_PAGE_WITH_SCORE, then we // go ahead and replace the old one. // // Either way, from now on we treat it // as an ADD_PAGE // Page editItemPage = editItem.getPage(); if (curInstruction == ADD_PAGE) { editItemPage.setScore(readerVal.getScore(), readerVal.getNextScore()); } piwriter.appendInstructionInfo(futureEdits, editItemPage, ADD_PAGE, NullWritable.get()); // // 2. We write the edit-page to *this* table. // newDb.append(editItemPage.getURL(), editItemPage); // // 3. We want the ADD in the next step (the // MD5-driven table) to be a "replacing add". // But that won't happen if the readerItem and // the editItem Pages are not identical. // (In this scenario, that means their URLs // are the same, but their MD5s are different.) // So, we need to explicitly handle that // case by issuing a DELETE for the now-obsolete // item. if (editItemPage.compareTo(readerVal) != 0) { pagesByMD5Edits++; piwriter.appendInstructionInfo(futureEdits, readerVal, DEL_PAGE, NullWritable.get()); } itemsWritten++; // "Delete" the readerVal by skipping it. hasEntries = db.next(readerKey, readerVal); } else { // ADD_PAGE_IFN_PRESENT. We only add IF_NOT // present. And it was present! So, we treat // this case like we treat a no-op. // Just move to the next edit. } // In either case, we process the edit. hasEdits = edits.next(editItem); } else if (comparison > 0) { // We have inserted a Page that's before some // entry in the existing database. So, we just // need to write down the Page from the Edit file. // It's like the above case, except we don't tell // the future-edits to delete anything. // // 1. Write the item down for the future. pagesByMD5Edits++; // // If this is an ADD_PAGE_IFN_PRESENT, then // we may also have a Link we have to take care of! // if (curInstruction == ADD_PAGE_IFN_PRESENT) { Link editLink = editItem.getLink(); if (editLink != null) { addLink(editLink); } } piwriter.appendInstructionInfo(futureEdits, editItem.getPage(), ADD_PAGE, NullWritable.get()); // // 2. Write the edit-page to *this* table newDb.append(editItem.getPage().getURL(), editItem.getPage()); itemsWritten++; // Process the edit hasEdits = edits.next(editItem); } } else if (curInstruction == DEL_PAGE) { if (comparison < 0) { // Write the readerKey, just passing it along. // We don't process the edit yet. newDb.append(readerKey, readerVal); itemsWritten++; hasEntries = db.next(readerKey, readerVal); } else if (comparison == 0) { // Delete it! We can only delete one item // at a time, as all URLs are unique. // 1. Tell the future-edits what page will need to // be deleted. pagesByMD5Edits++; piwriter.appendInstructionInfo(futureEdits, readerVal, DEL_PAGE, NullWritable.get()); // // 2. "Delete" the entry by skipping the Reader // key. hasEntries = db.next(readerKey, readerVal); // Process the edit hasEdits = edits.next(editItem); } else if (comparison > 0) { // Ignore it. We tried to delete an item that's // not here. 
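                        // The unmatched DEL is simply consumed here; nothing
                        // is forwarded to the MD5-keyed table for it.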
hasEdits = edits.next(editItem); } } } // Now we have only edits. No more preexisting items! while (! hasEntries && hasEdits) { int curInstruction = editItem.getInstruction(); if (curInstruction == ADD_PAGE || curInstruction == ADD_PAGE_WITH_SCORE || curInstruction == ADD_PAGE_IFN_PRESENT) { // No more reader entries, so ADD_PAGE_IFN_PRESENT // is treated like a simple ADD_PAGE. // 1. Tell the future edits-list about this new item pagesByMD5Edits++; piwriter.appendInstructionInfo(futureEdits, editItem.getPage(), ADD_PAGE, NullWritable.get()); // 2. Write the edit page to this table. newDb.append(editItem.getPage().getURL(), editItem.getPage()); itemsWritten++; } else if (curInstruction == DEL_PAGE) { // Ignore it. We tried to delete an item // that's not here. } // Either way, we always process the edit. hasEdits = edits.next(editItem); } // Now we have only preexisting items. We just copy // them to the new file, in order. while (hasEntries && ! hasEdits) { newDb.append(readerKey, readerVal); itemsWritten++; hasEntries = db.next(readerKey, readerVal); } } } /*** * The PagesByMD5Processor is used during close() time for * the pagesByMD5 table. We instantiate one of these, and it * takes care of the entire shutdown process. */ private class PagesByMD5Processor extends CloseProcessor { /** */ PagesByMD5Processor(MapFile.Reader db, EditSectionGroupWriter editWriter) { super(PAGES_BY_MD5, db, editWriter, new SequenceFile.Sorter(new PageInstruction.PageComparator(), NullWritable.class), null, Page.class, NullWritable.class, "PagesByMD5Part"); } /** */ void mergeEdits(MapFile.Reader db, SequenceFile.Reader sortedEdits, MapFile.Writer newDb) throws IOException { // Create the keys and vals Page readerItem = new Page(); PageInstruction editItem = new PageInstruction(); // For computing the GC list Page deletedItem = new Page(), lastItem = new Page(); boolean justDeletedItem = false; boolean newReaderItem = false; int itemRepeats = 0; // Read the first items from both streams boolean hasEntries = db.next(readerItem, NullWritable.get()); boolean hasEdits = sortedEdits.next(editItem, NullWritable.get()); if (hasEntries) { // The first thing we read should become // the "previous key". We need this for // garbage collection. outBuf.reset(); readerItem.write(outBuf); inBuf.reset(outBuf.getData(), outBuf.getLength()); lastItem.readFields(inBuf); itemRepeats = 0; } // As long we have both edits and entries, we need to // interleave them. while (hasEdits && hasEntries) { int comparison = readerItem.compareTo(editItem.getPage()); int curInstruction = editItem.getInstruction(); // // OK! Now perform operations // if (curInstruction == ADD_PAGE) { if (comparison < 0) { // Write readerItem, just passing it along. // Don't process the edit yet. newDb.append(readerItem, NullWritable.get()); itemsWritten++; hasEntries = db.next(readerItem, NullWritable.get()); newReaderItem = true; } else if (comparison == 0) { // // This is a "replacing ADD", which is generated // by the above-sequence. We should skip over the // existing item, and add the new one instead. // // Note that by this point, the new version of the // Page from the edit sequence is guaranteed to // have the correct score. We make sure of it in // the mergeEdits() for PagesByURLProcessor. // newDb.append(editItem.getPage(), NullWritable.get()); itemsWritten++; hasEntries = db.next(readerItem, NullWritable.get()); newReaderItem = true; hasEdits = sortedEdits.next(editItem, NullWritable.get()); } else if (comparison > 0) { // Write the edit item. 
                    // We've inserted an item that comes before any others.
                    newDb.append(editItem.getPage(), NullWritable.get());
                    itemsWritten++;
                    hasEdits = sortedEdits.next(editItem, NullWritable.get());
                }
            } else if (curInstruction == ADD_PAGE_IFN_PRESENT) {
                throw new IOException("Should never process ADD_PAGE_IFN_PRESENT for the index: " + editItem);
            } else if (curInstruction == DEL_PAGE) {
                if (comparison < 0) {
                    // Write the readerKey, just passing it along.
                    // Don't process the edit yet.
                    newDb.append(readerItem, NullWritable.get());
                    itemsWritten++;
                    hasEntries = db.next(readerItem, NullWritable.get());
                    newReaderItem = true;
                } else if (comparison == 0) {
                    // Delete it!  Remember, only one entry can
                    // be deleted at a time!
                    //
                    // "Delete" the entry by skipping over the reader
                    // item.  We move on to the next item in the existing
                    // index, as well as the next edit instruction.
                    hasEntries = db.next(readerItem, NullWritable.get());
                    newReaderItem = true;
                    hasEdits = sortedEdits.next(editItem, NullWritable.get());

                    // We need to set this flag for GC'ing.
                    justDeletedItem = true;
                } else if (comparison > 0) {
                    // This should never happen!  We should only be
                    // deleting items that actually appear!
                    throw new IOException("An unapplicable DEL_PAGE should never appear during index-merge: " + editItem);
                }
            }

            // GARBAGE COLLECTION
            // We want to detect when we have deleted the last
            // entry with a given MD5.  This index can hold
            // multiple entries with the same MD5, as long as
            // they have different URLs.  When the last one is
            // deleted, we want to know so we can modify the
            // LinkDB.
            if (newReaderItem) {
                // If the new readerItem has the same MD5 as our
                // last one, then we know it's a repeat!
                if (hasEntries && readerItem.getMD5().compareTo(lastItem.getMD5()) == 0) {
                    itemRepeats++;
                } else {
                    // The current readerItem and the lastItem
                    // MD5s are not equal.
                    //
                    // If the last item was deleted, AND if the
                    // deleted item is not a repeat of the current item,
                    // then that MD5 should be garbage collected.
                    if (justDeletedItem && itemRepeats == 0) {
                        deleteLink(lastItem.getMD5());
                    }

                    // The current readerItem is the new "last key".
                    outBuf.reset();
                    readerItem.write(outBuf);
                    inBuf.reset(outBuf.getData(), outBuf.getLength());
                    lastItem.readFields(inBuf);
                    itemRepeats = 0;
                }
                // Clear "new-reader-item" bit
                newReaderItem = false;
            }
            // Clear "last-deleted" bit
            justDeletedItem = false;
        }

        // Now we have only edits.  No more preexisting items!
        while (! hasEntries && hasEdits) {
            int curInstruction = editItem.getInstruction();
            if (curInstruction == ADD_PAGE) {
                // Just write down the new page!
                newDb.append(editItem.getPage(), NullWritable.get());
                itemsWritten++;
            } else if (curInstruction == ADD_PAGE_IFN_PRESENT) {
                throw new IOException("Should never process ADD_PAGE_IFN_PRESENT for the index: " + editItem);
            } else if (curInstruction == DEL_PAGE) {
                // This should never happen!  We should only be
                // deleting items that actually appear!
                throw new IOException("An unapplicable DEL_PAGE should never appear during index-merge: " + editItem);
            }
            hasEdits = sortedEdits.next(editItem, NullWritable.get());
        }

        // Now we have only preexisting items.  We just copy them
        // to the new file, in order.
        while (hasEntries && ! hasEdits) {
            // Simply copy through the remaining database items
            newDb.append(readerItem, NullWritable.get());
            itemsWritten++;
            hasEntries = db.next(readerItem, NullWritable.get());
            newReaderItem = true;
        }
    }
}

/**
 * The LinksByMD5Processor is used during close() for
 * the linksByMD5 table.
It processes all the edits to * this table, and also generates edits for the linksByURL * table. */ private class LinksByMD5Processor extends CloseProcessor { EditSectionGroupWriter futureEdits; /** */ public LinksByMD5Processor(MapFile.Reader db, EditSectionGroupWriter editWriter, EditSectionGroupWriter futureEdits) { super(LINKS_BY_MD5, db, editWriter, new SequenceFile.Sorter(new LinkInstruction.MD5Comparator(), NullWritable.class), new Link.MD5Comparator(), Link.class, NullWritable.class, "LinksByMD5Part"); this.futureEdits = futureEdits; } /** * Merges edits into the md5-driven link table. Also generates * edit sequence to apply to the URL-driven table. */ void mergeEdits(MapFile.Reader db, SequenceFile.Reader sortedEdits, MapFile.Writer newDb) throws IOException { WritableComparator comparator = new Link.MD5Comparator(); DeduplicatingLinkSequenceReader edits = new DeduplicatingLinkSequenceReader(sortedEdits); // Create the keys and vals we'll use LinkInstruction editItem = new LinkInstruction(); Link readerItem = new Link(); // Read the first items from both streams boolean hasEntries = db.next(readerItem, NullWritable.get()); boolean hasEdits = edits.next(editItem); // As long as we have both edits and entries to process, // we need to interleave them while (hasEntries && hasEdits) { int curInstruction = editItem.getInstruction(); // Perform operations if (curInstruction == ADD_LINK) { // When we add a link, we may replace a previous // link with identical URL and MD5 values. The // MD5FirstComparator will use both values. // int comparison = comparator.compare(readerItem, editItem.getLink()); if (comparison < 0) { // Write the readerKey, just passing it along. // Don't process the edit yet. newDb.append(readerItem, NullWritable.get()); itemsWritten++; hasEntries = db.next(readerItem, NullWritable.get()); } else if (comparison == 0) { // 1. Write down the item for table-edits if (futureEdits != null) { linksByURLEdits++; liwriter.appendInstructionInfo(futureEdits, editItem.getLink(), ADD_LINK, NullWritable.get()); } // 2. Write the new item, "replacing" the old one. // We move to the next edit instruction and move // past the replaced db entry. newDb.append(editItem.getLink(), NullWritable.get()); itemsWritten++; hasEntries = db.next(readerItem, NullWritable.get()); hasEdits = edits.next(editItem); } else if (comparison > 0) { // 1. Write down the item for table-edits if (futureEdits != null) { linksByURLEdits++; liwriter.appendInstructionInfo(futureEdits, editItem.getLink(), ADD_LINK, NullWritable.get()); } // 2. Write the new item. We stay at the current // db entry. newDb.append(editItem.getLink(), NullWritable.get()); itemsWritten++; hasEdits = edits.next(editItem); } } else if ((curInstruction == DEL_LINK) || (curInstruction == DEL_SINGLE_LINK)) { // When we delete a link, we might delete many // at once! We are interested only in the MD5 // here. If there are entries with identical MD5 // values, but different URLs, we get rid of them // all. int comparison = 0; if (curInstruction == DEL_LINK) { comparison = readerItem.getFromID().compareTo(editItem.getLink().getFromID()); } else { comparison = readerItem.md5Compare(editItem.getLink()); } if (comparison < 0) { // Write the readerKey, just passing it along. // Don't process the edit yet. newDb.append(readerItem, NullWritable.get()); itemsWritten++; hasEntries = db.next(readerItem, NullWritable.get()); } else if (comparison == 0) { // Delete it (or them!) // 1. Write the full instruction for the next // delete-stage. 
                        // That includes the read-in value.
                        //
                        // 2. "Delete" the entry by skipping the
                        // readerKey.  We DO NOT go to the next edit
                        // instruction!  There might still be more
                        // entries in the database to which we should
                        // apply this delete-edit.
                        //

                        // Step 1.  Write entry for future table-edits
                        if (futureEdits != null) {
                            linksByURLEdits++;
                            liwriter.appendInstructionInfo(futureEdits, readerItem, DEL_LINK, NullWritable.get());
                        }

                        // Step 2.
                        // A DEL_LINK may have to remove several entries
                        // that share this MD5, so keep the edit
                        // instruction around.
                        hasEntries = db.next(readerItem, NullWritable.get());
                        if (curInstruction == DEL_SINGLE_LINK) {
                            hasEdits = edits.next(editItem);
                        }
                    } else if (comparison > 0) {
                        // Ignore, move on to next instruction
                        hasEdits = edits.next(editItem);
                    }
                }
            }

            // Now we have only edits.  No more preexisting items!
            while (! hasEntries && hasEdits) {
                int curInstruction = editItem.getInstruction();
                if (curInstruction == ADD_LINK) {
                    // 1. Write down the item for future table-edits
                    if (futureEdits != null) {
                        linksByURLEdits++;
                        liwriter.appendInstructionInfo(futureEdits, editItem.getLink(), ADD_LINK, NullWritable.get());
                    }

                    // 2. Just add the item from the edit list
                    newDb.append(editItem.getLink(), NullWritable.get());
                    itemsWritten++;
                } else if (curInstruction == DEL_LINK) {
                    // Ignore operation
                }
                // Move on to next edit
                hasEdits = edits.next(editItem);
            }

            // Now we have only preexisting items.  Just copy them
            // to the new file, in order.
            while (hasEntries && ! hasEdits) {
                newDb.append(readerItem, NullWritable.get());
                itemsWritten++;
                hasEntries = db.next(readerItem, NullWritable.get());
            }
        }
    }

    /**
     * This class helps the LinksByURLProcessor test a list of
     * Page objects, sorted by URL, for outlink-counts.  We query
     * this class with a series of questions, based on Links sorted
     * by target URL.
     */
    private class TargetTester {
        MapFile.Reader pagedb;
        boolean hasPage = false;
        UTF8 pageURL = null;
        Page page = null;

        /** */
        public TargetTester(MapFile.Reader pagedb) throws IOException {
            this.pagedb = pagedb;
            this.pageURL = new UTF8();
            this.page = new Page();
            this.hasPage = pagedb.next(pageURL, page);
        }

        /**
         * Match the given URL against the sorted series of Page URLs.
         */
        public int hasOutlinks(UTF8 curURL) throws IOException {
            int returnCode = NO_OUTLINKS;
            int comparison = pageURL.compareTo(curURL);
            while (hasPage && comparison < 0) {
                hasPage = pagedb.next(pageURL, page);
                if (hasPage) {
                    comparison = pageURL.compareTo(curURL);
                }
            }

            if (hasPage) {
                if (comparison == 0) {
                    returnCode = (page.getNumOutlinks() > 0) ? HAS_OUTLINKS : NO_OUTLINKS;
                } else if (comparison > 0) {
                    //
                    // This situation indicates that the Link's
                    // target page has been deleted, probably
                    // because we repeatedly failed to fetch the URL.
                    // So, we should delete the Link.
                    //
                    returnCode = LINK_INVALID;
                }
            }
            return returnCode;
        }

        /** */
        public void close() throws IOException {
            pagedb.close();
        }
    }

    /**
     * Closes down and merges changes to the URL-driven link
     * table.  While merging, it refreshes each Link's
     * "targetHasOutlink" flag against the newly-written pagesByURL
     * table, and emits fix-up instructions (via futureEdits) for a
     * second pass over the linksByMD5 table.
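     *
     * The per-link outcome of the target test is:
     * <pre>
     *   LINK_INVALID  -> drop the link (DEL_SINGLE_LINK sent to the MD5 table)
     *   HAS_OUTLINKS  -> keep it, targetHasOutlink = true
     *   NO_OUTLINKS   -> keep it, targetHasOutlink = false
     * </pre>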
*/ private class LinksByURLProcessor extends CloseProcessor { MapFile.Reader pageDb; EditSectionGroupWriter futureEdits; /** */ public LinksByURLProcessor(MapFile.Reader db, EditSectionGroupWriter editWriter, MapFile.Reader pageDb, EditSectionGroupWriter futureEdits) { super(LINKS_BY_URL, db, editWriter, new SequenceFile.Sorter(new LinkInstruction.UrlComparator(), NullWritable.class), new Link.UrlComparator(), Link.class, NullWritable.class, "LinksByURLPart"); this.pageDb = pageDb; this.futureEdits = futureEdits; } /** */ public long closeDown(NutchFile workingDir, NutchFile outputDir) throws IOException { long result = super.closeDown(workingDir, outputDir); pageDb.close(); return result; } /** * Merge the existing db with the edit-stream into a brand-new file. */ void mergeEdits(MapFile.Reader db, SequenceFile.Reader sortedEdits, MapFile.Writer newDb) throws IOException { WritableComparator comparator = new Link.UrlComparator(); // Create the keys and vals we'll use LinkInstruction editItem = new LinkInstruction(); Link readerItem = new Link(); // Read the first items from both streams boolean hasEntries = db.next(readerItem, NullWritable.get()); boolean hasEdits = sortedEdits.next(editItem, NullWritable.get()); TargetTester targetTester = new TargetTester(pageDb); // As long as we have both edits and entries to process, // we need to interleave them while (hasEntries && hasEdits) { int curInstruction = editItem.getInstruction(); if (curInstruction == ADD_LINK) { // When we add a link, we may replace a previous // link with identical URL and MD5 values. Our // comparator will test both // int comparison = comparator.compare(readerItem, editItem.getLink()); if (comparison < 0) { // Write the readerKey, just passing it along. // Don't process the edit yet. int linkTest = targetTester.hasOutlinks(readerItem.getURL()); if (linkTest == LINK_INVALID) { liwriter.appendInstructionInfo(futureEdits, readerItem, DEL_SINGLE_LINK, NullWritable.get()); targetOutlinkEdits++; } else { boolean oldOutlinkStatus = readerItem.targetHasOutlink(); boolean newOutlinkStatus = (linkTest == HAS_OUTLINKS); // Do the conditional so we minimize unnecessary // mod-writes. if (oldOutlinkStatus != newOutlinkStatus) { readerItem.setTargetHasOutlink(newOutlinkStatus); liwriter.appendInstructionInfo(futureEdits, readerItem, ADD_LINK, NullWritable.get()); targetOutlinkEdits++; } newDb.append(readerItem, NullWritable.get()); itemsWritten++; } hasEntries = db.next(readerItem, NullWritable.get()); } else if (comparison == 0) { // Write the new item, "replacing" the old one. // We move to the next edit instruction and move // past the replaced db entry. Link editLink = editItem.getLink(); int linkTest = targetTester.hasOutlinks(editLink.getURL()); // Delete the edit/readerItem from the other table if it's // found to be invalid. if (linkTest == LINK_INVALID) { liwriter.appendInstructionInfo(futureEdits, editLink, DEL_SINGLE_LINK, NullWritable.get()); } else { editLink.setTargetHasOutlink(linkTest == HAS_OUTLINKS); liwriter.appendInstructionInfo(futureEdits, editLink, ADD_LINK, NullWritable.get()); newDb.append(editLink, NullWritable.get()); itemsWritten++; } targetOutlinkEdits++; hasEntries = db.next(readerItem, NullWritable.get()); hasEdits = sortedEdits.next(editItem, NullWritable.get()); } else if (comparison > 0) { // Write the new item. We stay at the current // db entry. 
Link editLink = editItem.getLink(); int linkTest = targetTester.hasOutlinks(editLink.getURL()); // Delete the edit from the other table if it's invalid if (linkTest == LINK_INVALID) { liwriter.appendInstructionInfo(futureEdits, editLink, DEL_SINGLE_LINK, NullWritable.get()); } else { editLink.setTargetHasOutlink(linkTest == HAS_OUTLINKS); liwriter.appendInstructionInfo(futureEdits, editLink, ADD_LINK, NullWritable.get()); newDb.append(editLink, NullWritable.get()); itemsWritten++; } targetOutlinkEdits++; hasEdits = sortedEdits.next(editItem, NullWritable.get()); } } else if (curInstruction == DEL_LINK) { // When we delete a link, we do it by MD5 and apply // it to the index first. A single delete instruction // may remove many items in the db, during the earlier // processing. However, unlike the index-processing stage, // here we can expect a new DEL instruction for every // item that we remove from the db. // int comparison = comparator.compare(readerItem, editItem.getLink()); if (comparison < 0) { // Write readerKey, just passing it along. Don't // process the edit yet. int linkTest = targetTester.hasOutlinks(readerItem.getURL()); // Delete the reader item if it's found to be invalid if (linkTest == LINK_INVALID) { liwriter.appendInstructionInfo(futureEdits, readerItem, DEL_SINGLE_LINK, NullWritable.get()); } else { readerItem.setTargetHasOutlink(linkTest == HAS_OUTLINKS); liwriter.appendInstructionInfo(futureEdits, readerItem, ADD_LINK, NullWritable.get()); newDb.append(readerItem, NullWritable.get()); itemsWritten++; } targetOutlinkEdits++; hasEntries = db.next(readerItem, NullWritable.get()); } else if (comparison == 0) { // "Delete" the item by passing by the readerKey. // We want a new entry, as well as the next instruction // to process. hasEntries = db.next(readerItem, NullWritable.get()); hasEdits = sortedEdits.next(editItem, NullWritable.get()); } else if (comparison > 0) { // Ignore, move on to next instruction hasEdits = sortedEdits.next(editItem, NullWritable.get()); } } } // Now we have only edits. No more preexisting items! while (! hasEntries && hasEdits) { int curInstruction = editItem.getInstruction(); if (curInstruction == ADD_LINK) { // // Add the item from the edit list. // // // Make sure the outlinks flag is set properly. // Link editLink = editItem.getLink(); int linkTest = targetTester.hasOutlinks(editLink.getURL()); if (linkTest == LINK_INVALID) { liwriter.appendInstructionInfo(futureEdits, editLink, DEL_SINGLE_LINK, NullWritable.get()); } else { editLink.setTargetHasOutlink(linkTest == HAS_OUTLINKS); liwriter.appendInstructionInfo(futureEdits, editLink, ADD_LINK, NullWritable.get()); newDb.append(editLink, NullWritable.get()); itemsWritten++; } targetOutlinkEdits++; } else if (curInstruction == DEL_LINK) { // Ignore operation } // Move on to next edit hasEdits = sortedEdits.next(editItem, NullWritable.get()); } // Now we have only preexisting items. Just copy them // to the new file, in order. while (hasEntries && ! hasEdits) { // // Simply copy the remaining database items. // // // First, make sure the 'outlinks' flag is set properly. 
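                // targetTester walks the URL-sorted pagesByURL table in
                // lockstep with these URL-sorted links, so each
                // hasOutlinks() call only ever advances forward; it never
                // has to seek backwards.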
// int linkTest = targetTester.hasOutlinks(readerItem.getURL()); if (linkTest == LINK_INVALID) { liwriter.appendInstructionInfo(futureEdits, readerItem, DEL_SINGLE_LINK, NullWritable.get()); targetOutlinkEdits++; } else { boolean oldOutlinkStatus = readerItem.targetHasOutlink(); boolean newOutlinkStatus = (linkTest == HAS_OUTLINKS); if (oldOutlinkStatus != newOutlinkStatus) { readerItem.setTargetHasOutlink(newOutlinkStatus); liwriter.appendInstructionInfo(futureEdits, readerItem, ADD_LINK, NullWritable.get()); targetOutlinkEdits++; } // Now copy the object newDb.append(readerItem, NullWritable.get()); itemsWritten++; } // Move on to next hasEntries = db.next(readerItem, NullWritable.get()); } targetTester.close(); } } /** * Method useful for the first time we create a distributed db project. * Basically need to write down the number of dirs we can expect. */ public static void createDB(NutchFileSystem nutchfs, String dbName, int totalMachines) throws IOException { // // Check to see if the db already exists // NutchFile machineInfo = new NutchFile(nutchfs, dbName, "standard", new File("machineinfo")); if (nutchfs.get(machineInfo, LONG_TIMEOUT) != null) { throw new IOException("Cannot create WebDB at nutchfs " + nutchfs + " with name " + dbName + ", as it already exists."); } // // Write down how many machines live in the distributed pool // File machineInfoFile = nutchfs.getWorkingFile(); DataOutputStream out = new DataOutputStream(new FileOutputStream(machineInfoFile)); try { out.write(MACHINE_INFO_VERSION); out.writeInt(totalMachines); } finally { out.close(); } nutchfs.put(machineInfo, machineInfoFile, true); // // Create the lower directory structures for each machine in pool. // for (int i = 0; i < totalMachines; i++) { NutchFile dbDir = new NutchFile(nutchfs, dbName, "standard", new File("webdb")); NutchFile sectionDir = new NutchFile(dbDir, "dbsection." + i); NutchFile pagesByURLNF = new NutchFile(sectionDir, PAGES_BY_URL); NutchFile pagesByMD5NF = new NutchFile(sectionDir, PAGES_BY_MD5); NutchFile linksByURLNF = new NutchFile(sectionDir, LINKS_BY_URL); NutchFile linksByMD5NF = new NutchFile(sectionDir, LINKS_BY_MD5); File pagesByURLFile = nutchfs.getWorkingFile(); File pagesByMD5File = nutchfs.getWorkingFile(); File linksByURLFile = nutchfs.getWorkingFile(); File linksByMD5File = nutchfs.getWorkingFile(); // // If we're creating the db, we make a zero-length file for each // db file // new MapFile.Writer(pagesByURLFile.getPath(), new UTF8.Comparator(), Page.class).close(); new MapFile.Writer(pagesByMD5File.getPath(), new Page.Comparator(), NullWritable.class).close(); new MapFile.Writer(linksByURLFile.getPath(), new Link.UrlComparator(), NullWritable.class).close(); new MapFile.Writer(linksByMD5File.getPath(), new Link.MD5Comparator(), NullWritable.class).close(); nutchfs.put(pagesByURLNF, pagesByURLFile, true); nutchfs.put(pagesByMD5NF, pagesByMD5File, true); nutchfs.put(linksByURLNF, linksByURLFile, true); nutchfs.put(linksByMD5NF, linksByMD5File, true); } // // Create the "ready-to-use" flag that tells all subsequent // WebDBWriters it's OK to proceed. 
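        //
        // To recap, a minimal bootstrap looks like this (the path and
        // machine count here are illustrative):
        //
        //   NutchFileSystem nfs = new NutchNFSFileSystem(new File("/share/db"), true);
        //   DistributedWebDBWriter.createDB(nfs, "db", 4);   // once, for 4 machines
        //   // ...then each machine i opens new DistributedWebDBWriter(nfs, "db", i)
        //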
// File readyToUseFile = nutchfs.getWorkingFile(); NutchFile readyToUse = new NutchFile(nutchfs, dbName, "standard", new File("readyToUse")); out = new DataOutputStream(new FileOutputStream(readyToUseFile)); try { out.writeInt(READY_TO_USE); // Magic number } finally { out.close(); } nutchfs.put(readyToUse, readyToUseFile, false); } PageInstructionWriter piwriter = new PageInstructionWriter(); LinkInstructionWriter liwriter = new LinkInstructionWriter(); DataInputBuffer inBuf = new DataInputBuffer(); DataOutputBuffer outBuf = new DataOutputBuffer(); NutchFileSystem nutchfs; String dbName; NutchFile dbDir, oldDbDir, newDbDir, tmpDir; NutchFile localWriteLock, globalWriteLock, closeCounter, openCounter; EditSectionGroupWriter pagesByURLWriter, pagesByMD5Writer, linksByURLWriter, linksByMD5Writer; MapFile.Reader pagesByURL, pagesByMD5, linksByURL, linksByMD5; long pagesByURLEdits = 0, pagesByMD5Edits = 0, linksByURLEdits = 0, linksByMD5Edits = 0, targetOutlinkEdits = 0; int machineNum, totalMachines; /** * Open the db files. */ public DistributedWebDBWriter(NutchFileSystem nutchfs, String dbName, int machineNum) throws IOException { // // Store the nutchfs. Build dir set. // this.nutchfs = nutchfs; this.dbName = dbName; this.machineNum = machineNum; this.dbDir = new NutchFile(nutchfs, dbName, "standard", new File("webdb")); this.oldDbDir = new NutchFile(nutchfs, dbName, "standard", new File("webdb.old")); this.newDbDir = new NutchFile(nutchfs, dbName, "standard", new File("webdb.new")); this.tmpDir = new NutchFile(newDbDir, "tmp"); // // Wait indefinitely for "ready-to-use-flag". // NutchFile readyToUse = new NutchFile(nutchfs, dbName, "standard", new File("readyToUse")); nutchfs.get(readyToUse); ////////////////////////////////////////////////////////// // Locks ////////////////////////////////////////////////////////// // 1. Each dbsection has a lock so only one writer ever accesses // it at once. Lock the local one immediately. this.localWriteLock = new NutchFile(nutchfs, dbName, "standard", new File("sectionLock." + machineNum)); nutchfs.lock(localWriteLock, true); // 2. A global writeLock protects writers that need to make // changes that affect many processors (such as moving dbDir or // deleting tmp). // // Readers will obtain this lock non-exclusively. When it comes // time for global changes to the db, writers will obtain it // exclusively. Readers need to leave before these changes can // be made. this.globalWriteLock = new NutchFile(nutchfs, dbName, "standard", new File("globalWriteLock")); // 3. Not quite a lock, but related: the closeCounter, which // tracks how many processors have made it through the db close // sequence. This is protected by globalWriteLock. this.openCounter = new NutchFile(newDbDir, "openCounter"); this.closeCounter = new NutchFile(newDbDir, "closeCounter"); ////////////////////////////////////////////////////////// // Setup and Initialization ////////////////////////////////////////////////////////// // Load # of machines NutchFile machineInfo = new NutchFile(nutchfs, dbName, "standard", new File("machineinfo")); File machineInfoFile = nutchfs.get(machineInfo); DataInputStream in = new DataInputStream(new FileInputStream(machineInfoFile)); try { in.read(); // version this.totalMachines = in.readInt(); } finally { in.close(); } // // Seize global lock // nutchfs.lock(globalWriteLock, true); // Now we use these locks to resolve any partially-completed // state directories from a previous run. 
// REMIND - mjc - Fixing/defining the db/newdb and tmp-delete // sequence is the most important next step! /*** File oldDbDirFile = nutchfs.get(oldDbDir, SHORT_TIMEOUT); if (oldDbDirFile != null) { File dbDirFile = nutchfs.get(dbDir, SHORT_TIMEOUT); if (dbDirFile != null) { throw new IOException("Impossible condition: directories " + oldDbDir + " and " + dbDir + " cannot exist simultaneously"); } File newDbDirFile = nutchfs.get(newDbDir, SHORT_TIMEOUT); if (newDbDirFile != null) { nutchfs.renameTo(newDbDir, dbDir); } nutchfs.delete(oldDbDir); } else { File newDbDirFile = nutchfs.get(newDbDir, SHORT_TIMEOUT); if (newDbDirFile != null) { nutchfs.delete(newDbDir); } } // Delete any partial edits from last time. if (nutchfs.get(tmpDir, LONG_TIMEOUT) != null) { nutchfs.delete(tmpDir); } ****/ // Load how many machines have started yet. If we're the // first one, then we have to create the EditSectionWriter // structures. int numOpens = 0; File openCounterFile = nutchfs.get(openCounter, LONG_TIMEOUT); if (openCounterFile != null) { in = new DataInputStream(new FileInputStream(openCounterFile)); try { in.read(); // version numOpens = in.readInt(); } finally { in.close(); } } else { openCounterFile = nutchfs.getWorkingFile(); } // Bump number by 1. DataOutputStream out = new DataOutputStream(new FileOutputStream(openCounterFile)); try { out.write(OPEN_COUNTER_VERSION); out.writeInt(numOpens + 1); } finally { out.close(); } nutchfs.put(openCounter, openCounterFile, true); // Check if we're the first ones to open. if (numOpens == 0) { // Build an edit-section for each of the 4 edit types EditSectionGroupWriter.createEditGroup(nutchfs, dbName, PAGES_BY_URL, totalMachines, EditSectionGroupWriter.URL_KEYSPACE); EditSectionGroupWriter.createEditGroup(nutchfs, dbName, PAGES_BY_MD5, totalMachines, EditSectionGroupWriter.MD5_KEYSPACE); EditSectionGroupWriter.createEditGroup(nutchfs, dbName, LINKS_BY_URL, totalMachines, EditSectionGroupWriter.URL_KEYSPACE); EditSectionGroupWriter.createEditGroup(nutchfs, dbName, LINKS_BY_MD5, totalMachines, EditSectionGroupWriter.MD5_KEYSPACE); // Remove the flag that tells readers it's OK to proceed NutchFile dirIsComplete = new NutchFile(dbDir, "dbIsComplete"); nutchfs.delete(dirIsComplete); } // These are the NutchFiles for this section of the read-only // db. NutchFile sectionDir = new NutchFile(dbDir, "dbsection." + machineNum); NutchFile pagesByURLNF = new NutchFile(sectionDir, PAGES_BY_URL); NutchFile pagesByMD5NF = new NutchFile(sectionDir, PAGES_BY_MD5); NutchFile linksByURLNF = new NutchFile(sectionDir, LINKS_BY_URL); NutchFile linksByMD5NF = new NutchFile(sectionDir, LINKS_BY_MD5); // // Release the global lock // nutchfs.release(globalWriteLock); // Create Readers for the above NutchFiles this.pagesByURL = new MapFile.Reader(nutchfs.get(pagesByURLNF).getPath(), new UTF8.Comparator()); this.pagesByMD5 = new MapFile.Reader(nutchfs.get(pagesByMD5NF).getPath(), new Page.Comparator()); this.linksByURL = new MapFile.Reader(nutchfs.get(linksByURLNF).getPath(), new Link.UrlComparator()); this.linksByMD5 = new MapFile.Reader(nutchfs.get(linksByMD5NF).getPath(), new Link.MD5Comparator()); // Create writers for new edit-files. We write changes // into these files, then apply them to the db upon close(). 
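        // Each EditSectionGroupWriter appears to partition its instructions
        // across the totalMachines sections by the supplied extractor's key
        // (URL or MD5), so that at close() every machine can fetch exactly
        // the edits that fall in its own key range.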
this.pagesByURLWriter = new EditSectionGroupWriter(nutchfs, dbName, machineNum, totalMachines, PAGES_BY_URL, PageInstruction.class, NullWritable.class, new EditSectionGroupWriter.PageURLExtractor()); this.pagesByMD5Writer = new EditSectionGroupWriter(nutchfs, dbName, machineNum, totalMachines, PAGES_BY_MD5, PageInstruction.class, NullWritable.class, new EditSectionGroupWriter.PageMD5Extractor()); this.linksByURLWriter = new EditSectionGroupWriter(nutchfs, dbName, machineNum, totalMachines, LINKS_BY_URL, LinkInstruction.class, NullWritable.class, new EditSectionGroupWriter.LinkURLExtractor()); this.linksByMD5Writer = new EditSectionGroupWriter(nutchfs, dbName, machineNum, totalMachines, LINKS_BY_MD5, LinkInstruction.class, NullWritable.class, new EditSectionGroupWriter.LinkMD5Extractor()); } /** * Shutdown */ public synchronized void close() throws IOException { // Process the 4 tables: // 1. pagesByURL // 2. pagesByMD5 // 3. linksByMD5 // 4. linksByURL // 1. Process pagesByURL. Processing this stream will // generate a number of edits for the pagesByMD5 step. // CloseProcessor pagesByURLProcessor = new PagesByURLProcessor(pagesByURL, pagesByURLWriter, pagesByMD5Writer); long numPBUItems = pagesByURLProcessor.closeDown(tmpDir, newDbDir); // // 2. Process the pagesByMD5 edit stream. This will // make calls to deleteLink(), which are processed later. // CloseProcessor pagesByMD5Processor = new PagesByMD5Processor(pagesByMD5, pagesByMD5Writer); long numPBMItems = pagesByMD5Processor.closeDown(tmpDir, newDbDir); // // 3. Process the linksByMD5 edit stream first. This // will generate a number of edits for the linksByURL // stream. This also processes the calls to deleteLink() // that may have been invoked as part of the above call // to process pagesByMD5. CloseProcessor linksByMD5Processor = new LinksByMD5Processor(linksByMD5, linksByMD5Writer, linksByURLWriter); long numLBMItems = linksByMD5Processor.closeDown(tmpDir, newDbDir); // // 4. Process the linksByURL edit stream. This will also // read through the sorted PagesByURL file, and modify // the Links so that they indicated whether the target // Page has any outlinks or not. // // Duplicate the LINKS_BY_MD5 editsWriter, because the 1st one has // already been closed. EditSectionGroupWriter targetOutlinkEditsWriter = new EditSectionGroupWriter(nutchfs, dbName, machineNum, totalMachines, LINKS_BY_MD5, LinkInstruction.class, NullWritable.class, new EditSectionGroupWriter.LinkMD5Extractor()); // Find the just-written dbsection output for PAGES_BY_URL NutchFile newSectionDir = new NutchFile(newDbDir, "dbsection." + machineNum); NutchFile newPagesByURLNF = new NutchFile(newSectionDir, PAGES_BY_URL); CloseProcessor linksByURLProcessor = new LinksByURLProcessor(linksByURL, linksByURLWriter, new MapFile.Reader(nutchfs.get(newPagesByURLNF).getPath(), new UTF8.Comparator()), targetOutlinkEditsWriter); long numLBUItems = linksByURLProcessor.closeDown(tmpDir, newDbDir); // // If the number of linksByURL processed is zero, then // there's no reason to do all of the following with // a 2nd pass through linksByMD5. // if (numLBUItems != 0) { // // 5. Step 4 did several things to the LinksByURL db. // First, it implemented all the changes generated // by instructions from LinksByMD5Processor. Second, // it made lots of calls to setTargetHasOutlink. This // changes the content of the Link objects. // // So now we need to reconstruct the LinksByMD5 // list, using the Links we created in step #4. 
// NutchFile newLinksByMD5NF = new NutchFile(newSectionDir, LINKS_BY_MD5); MapFile.Reader linksByMD5ForStageTwo = new MapFile.Reader(nutchfs.get(newLinksByMD5NF).getPath(), new Link.MD5Comparator()); NutchFile stageTwoDbDir = new NutchFile(newDbDir, "stage2.subdir"); CloseProcessor linksByMD5StageTwoProcessor = new LinksByMD5Processor(linksByMD5ForStageTwo, targetOutlinkEditsWriter, null); numLBMItems = linksByMD5StageTwoProcessor.closeDown(tmpDir, stageTwoDbDir); // // 6. Now move the Stage2 LinksByMD5 file up to replace // the one at the primary level // linksByMD5ForStageTwo.close(); NutchFile stageOneLinksByMD5 = new NutchFile(newDbDir, LINKS_BY_MD5); NutchFile stageTwoLinksByMD5 = new NutchFile(stageTwoDbDir, LINKS_BY_MD5); nutchfs.delete(stageOneLinksByMD5); nutchfs.renameTo(stageTwoLinksByMD5, stageOneLinksByMD5); } // // 7. Finally, write out the total num of pages and links // //NutchFile newSectionDir = new NutchFile(newDbDir, "dbsection." + machineNum); NutchFile sectionStats = new NutchFile(newSectionDir, STATS_FILE); File sectionStatsFile = nutchfs.getWorkingFile(); DataOutputStream out = new DataOutputStream(new FileOutputStream(sectionStatsFile)); try { // // These counts are guaranteed to be correct; they're // based on the counts made during processing of primary-key // edits. Pages are always counted by URL first, and only // subsequently by MD5 if there are any edits to make. Links // are always counted by MD5 first, and only by URL subsequently // and conditionally. // // If there are a bunch of edits that result in no modifications // to the db, the two sets of counts (one for URL, one for // MD5) could become out of sync. So we use the ones that // are sure to be accurate. // out.write(CUR_VERSION); out.writeLong(numPBUItems); out.writeLong(numLBMItems); } finally { out.close(); nutchfs.put(sectionStats, sectionStatsFile, true); } // Close down the db-readers pagesByURL.close(); pagesByMD5.close(); linksByMD5.close(); linksByURL.close(); ////////////////////////////////////////////////////////////// // Now we need to do a distributed-close. It works by // the "last person out turns off the lights" protocol. // All the processors but one will exit without doing anything. // The last one to exit does all the directory moves. ////////////////////////////////////////////////////////////// // // First step is to obtain the global writeLock exclusively. // DBReaders will try to obtain this non-exclusively. That // way, there can be many readers at once, but these must // leave before a single process can blow away the directories. // nutchfs.lock(globalWriteLock, true); // // Read in how many processes have closed already // int numCloses = 0; File closeCounterFile = nutchfs.get(closeCounter, LONG_TIMEOUT); if (closeCounterFile != null) { DataInputStream in = new DataInputStream(new FileInputStream(closeCounterFile)); try { in.read(); // version numCloses = in.readInt(); } finally { in.close(); } } else { closeCounterFile = nutchfs.getWorkingFile(); } if (numCloses == totalMachines) { throw new IOException("All the processors have already shut down. Impossible condition!"); } // Bump that number by 1. out = new DataOutputStream(new FileOutputStream(closeCounterFile)); try { out.write(CLOSE_COUNTER_VERSION); out.writeInt(numCloses + 1); } finally { out.close(); } nutchfs.put(closeCounter, closeCounterFile, true); // Check if this processor is the last one to close. if (numCloses == totalMachines - 1) { // Delete edits that might still be lingering around... 
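            // Only the last closer reaches this point, so every machine has
            // finished its merge; any edit sections still on disk are
            // leftovers that should be safe to remove.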
            // Delete edits that might still be lingering around...
            for (int i = 0; i < totalMachines; i++) {
                new EditSectionGroupReader(nutchfs, dbName, PAGES_BY_URL, i, totalMachines).delete();
                new EditSectionGroupReader(nutchfs, dbName, PAGES_BY_MD5, i, totalMachines).delete();
                new EditSectionGroupReader(nutchfs, dbName, LINKS_BY_URL, i, totalMachines).delete();
                new EditSectionGroupReader(nutchfs, dbName, LINKS_BY_MD5, i, totalMachines).delete();
            }

            // Complete directories and move them into place
            nutchfs.completeDir(tmpDir);
            nutchfs.completeDir(dbDir);
            nutchfs.completeDir(newDbDir);

            //
            // Write out the "complete" flag, which tells
            // readers it's OK to proceed
            //
            File dirIsCompleteFile = nutchfs.getWorkingFile();
            NutchFile dirIsComplete = new NutchFile(newDbDir, "dbIsComplete");
            out = new DataOutputStream(new FileOutputStream(dirIsCompleteFile));
            try {
                out.writeInt(IS_COMPLETE);    // Magic number
            } finally {
                out.close();
            }
            nutchfs.put(dirIsComplete, dirIsCompleteFile, true);

            // Here we need to 'finish' the db operation.
            // That involves: 1. Removing the tmpdir.
            //                2. Moving the dbDir to oldDbDir
            //                3. Renaming the newDbDir to dbDir
            //                4. Removing the oldDbDir
            //
            // 1.
            nutchfs.delete(tmpDir);
            // 2.
            nutchfs.renameTo(dbDir, oldDbDir);
            // 3.
            nutchfs.renameTo(newDbDir, dbDir);
            // 4.
            nutchfs.delete(oldDbDir);
        }

        // Done.
        nutchfs.release(globalWriteLock);
        nutchfs.release(localWriteLock);
    }

    /////////////////////
    // Methods for adding and managing db operations
    /////////////////////

    /**
     * Add a page to the page database
     */
    public synchronized void addPage(Page page) throws IOException {
        // The 2nd (byMD5) part is handled during processing of the 1st.
        pagesByURLEdits++;
        piwriter.appendInstructionInfo(pagesByURLWriter, page, ADD_PAGE, NullWritable.get());
    }

    /**
     * Add a page to the page database, with a brand-new score
     */
    public synchronized void addPageWithScore(Page page) throws IOException {
        // The 2nd (byMD5) part is handled during processing of the 1st.
        pagesByURLEdits++;
        piwriter.appendInstructionInfo(pagesByURLWriter, page, ADD_PAGE_WITH_SCORE, NullWritable.get());
    }

    /**
     * Don't replace the one in the database, if there is one.
     */
    public synchronized void addPageIfNotPresent(Page page) throws IOException {
        // The 2nd (index) part is handled during processing of the 1st.
        pagesByURLEdits++;
        piwriter.appendInstructionInfo(pagesByURLWriter, page, ADD_PAGE_IFN_PRESENT, NullWritable.get());
    }

    /**
     * Don't replace the one in the database, if there is one.
     *
     * If we do insert the new Page, then we should also insert
     * the given Link object.
     */
    public synchronized void addPageIfNotPresent(Page page, Link link) throws IOException {
        // The 2nd (index) part is handled during processing of the 1st.
        pagesByURLEdits++;
        piwriter.appendInstructionInfo(pagesByURLWriter, page, link, ADD_PAGE_IFN_PRESENT, NullWritable.get());
    }

    /**
     * Remove a page from the page database.
     */
    public synchronized void deletePage(String url) throws IOException {
        // The 2nd (index) part is handled during processing of the 1st.
        Page p = new Page();
        p.setURL(url);
        pagesByURLEdits++;
        piwriter.appendInstructionInfo(pagesByURLWriter, p, DEL_PAGE, NullWritable.get());
    }

    /**
     * Add a link to the link database
     */
    public synchronized void addLink(Link lr) throws IOException {
        linksByMD5Edits++;
        liwriter.appendInstructionInfo(linksByMD5Writer, lr, ADD_LINK, NullWritable.get());
    }
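    //
    // All of the mutators above just buffer a PageInstruction or
    // LinkInstruction with the appropriate EditSectionGroupWriter;
    // nothing is applied to the db until close() processes the edit
    // streams.  deleteLink() below is private because clients never
    // call it directly -- it is invoked internally while the
    // pagesByMD5 edits are processed during close().
    //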
    /**
     * Remove links with the given MD5 from the db.
     */
    private synchronized void deleteLink(MD5Hash md5) throws IOException {
        linksByMD5Edits++;
        liwriter.appendInstructionInfo(linksByMD5Writer, new Link(md5, 0, "", ""), DEL_LINK, NullWritable.get());
    }

    /**
     * DistributedWebDBWriter.main() provides some handy methods for
     * testing the WebDB.
     */
    public static void main(String argv[]) throws FileNotFoundException, IOException {
        if (argv.length < 2) {
            System.out.println("Usage: java net.nutch.db.DistributedWebDBWriter <db> [-create <numProcessors>] | <machineInt> ([-addpage id url] | [-addpageifnp id url] | [-deletepage url] | [-addlink fromID url] | [-deletelink fromID])");
            return;
        }

        NutchFileSystem nutchfs = new NutchNFSFileSystem(new File(argv[0]), true);
        if ("-create".equals(argv[1])) {
            DistributedWebDBWriter.createDB(nutchfs, "db", Integer.parseInt(argv[2]));
            System.out.println("Created webdb at " + argv[0]);
        } else {
            int machineNum = Integer.parseInt(argv[1]);
            String cmd = argv[2];
            if ("-addpage".equals(cmd)) {
                MD5Hash md5 = new MD5Hash(argv[3]);
                String url = argv[4];

                DistributedWebDBWriter writer =
                    new DistributedWebDBWriter(nutchfs, "db", machineNum);
                try {
                    Page page = new Page(url, md5);
                    writer.addPageWithScore(page);
                    System.out.println("Added page (with score): " + page);
                } finally {
                    writer.close();
                }
            } else if ("-addpageifnp".equals(cmd)) {
                MD5Hash md5 = new MD5Hash(argv[3]);
                String url = argv[4];

                DistributedWebDBWriter writer =
                    new DistributedWebDBWriter(nutchfs, "db", machineNum);
                try {
                    Page page = new Page(url, md5);
                    writer.addPageIfNotPresent(page);
                    System.out.println("Added page: " + page);
                } finally {
                    writer.close();
                }
            } else if ("-deletepage".equals(cmd)) {
                String url = argv[3];

                DistributedWebDBWriter writer =
                    new DistributedWebDBWriter(nutchfs, "db", machineNum);
                try {
                    writer.deletePage(url.trim());
                    System.out.println("Deleted item(s)");
                } finally {
                    writer.close();
                }
            } else if ("-addlink".equals(cmd)) {
                MD5Hash fromID = new MD5Hash(argv[3]);
                String url = argv[4];

                DistributedWebDBWriter writer =
                    new DistributedWebDBWriter(nutchfs, "db", machineNum);
                try {
                    Link link = new Link(fromID,
                                         MD5Hash.digest("randomstring.com").halfDigest(),
                                         url,
                                         "SomeRandomAnchorText_" + System.currentTimeMillis());
                    writer.addLink(link);
                    System.out.println("Added link: " + link);
                } finally {
                    writer.close();
                }
            } else if ("-deletelink".equals(cmd)) {
                MD5Hash fromID = new MD5Hash(argv[3]);

                DistributedWebDBWriter writer =
                    new DistributedWebDBWriter(nutchfs, "db", machineNum);
                try {
                    writer.deleteLink(fromID);
                    System.out.println("Deleted item(s)");
                } finally {
                    writer.close();
                }
            } else {
                System.out.println("Sorry, no command with name " + cmd);
            }
        }
    }
}
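
/**
 * Illustrative usage sketch -- this class is not part of the original
 * Nutch source.  It drives DistributedWebDBWriter programmatically the
 * same way main() above does from the command line, assuming a
 * single-processor setup.  The db location ("./demo-db") and the URL
 * below are made-up values for the example.
 */
class DistributedWebDBWriterExample {
    public static void main(String[] args) throws IOException {
        // Lay out a webdb for one processor, then open the writer for
        // machine #0, the only section.
        NutchFileSystem nutchfs = new NutchNFSFileSystem(new File("./demo-db"), true);
        DistributedWebDBWriter.createDB(nutchfs, "db", 1);

        DistributedWebDBWriter writer = new DistributedWebDBWriter(nutchfs, "db", 0);
        try {
            // This call only buffers an ADD_PAGE_IFN_PRESENT
            // instruction; the edits are sorted and applied when
            // close() runs.
            Page page = new Page("http://example.com/", MD5Hash.digest("http://example.com/"));
            writer.addPageIfNotPresent(page);
        } finally {
            // As the last (and only) processor to close, this call
            // applies the edits and moves the new db into place.
            writer.close();
        }
    }
}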